import requests
from lxml import etree
import lxml.html
from mysqlDb.db import ConDb

# Database helper from the project-local mysqlDb.db module; the scraping loop
# further down calls con.insert_TB(...) once per scraped table row.
con = ConDb()

# Identification of the school; both values are copied verbatim into every
# stored row.
school_url = "www.nthu.edu.tw"
school = "国立清华大学"

# (academy display name, academy page URL) pairs, in crawl order.
_ACADEMY_PAGES = (
    ("理學院", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=53"),
    ("工學院", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=57"),
    ("原子科學院", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=61"),
    ("人文社會學院", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=63"),
    ("生命科學院", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=65"),
    ("電機資訊學院", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=69"),
    ("科技管理學院", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=71"),
    ("竹師教育學院", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=73"),
    ("藝術學院", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=75"),
    ("台北政經學院", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=77"),
    ("清華學院", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=86"),
    ("系所調整院務中心", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=87"),
    ("其他單位", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=89"),
    ("事務性單位", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=91"),
    ("清華緊急救護團隊", "http://tel.net.nthu.edu.tw/nthusearch/dept.php?dd=93"),
)

# One {"name": ..., "url": ...} dict per academy; this is the list the
# scraping loop iterates over.
li = [{"name": title, "url": link} for title, link in _ACADEMY_PAGES]

# Seconds to wait on the server for each request; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 30

# Deletion table for the control characters stripped from cell text.
_CELL_NOISE = str.maketrans("", "", "\n\r\t")


def _clean_cell(text):
    """Return *text* as a str with newlines, carriage returns and tabs removed."""
    return str(text).translate(_CELL_NOISE)


def _parse_person_row(tr):
    """Extract ``(name, professional, phone, email)`` from one ``<tr>`` element.

    Cells are addressed by their 1-based position among the row's ``<td>``
    elements: 1 = name, 2 = professional, 3 = phone, 5 = email link.
    Any field whose cell is missing or empty is returned as ``''``.
    """
    name = professional = phone = email = ''
    # enumerate() keeps the column number tied to the cell position even when
    # a cell is empty, so a 5th cell without a link can no longer make later
    # cells be mistaken for the email column.
    for column, td in enumerate(tr.xpath(".//td"), start=1):
        if column == 5:
            links = td.xpath("descendant-or-self::td/a/@href")
            if links:
                # NOTE(review): the href is stored verbatim; if the page uses
                # "mailto:" links the prefix is kept -- confirm this is wanted.
                email = links[0]
            continue

        texts = td.xpath("descendant-or-self::td/text()")
        if not texts:
            continue

        value = _clean_cell(texts[0])
        if column == 1:
            name = value
        elif column == 2:
            professional = value
        elif column == 3:
            phone = value
    return name, professional, phone, email


# The session is used as a context manager so its connections are released
# when the crawl finishes or fails.
with requests.Session() as r:
    for xy in li:
        academy = xy["name"]
        # Academy page URL
        academy_url = xy["url"]

        html = r.get(academy_url, timeout=REQUEST_TIMEOUT).text
        etr = etree.HTML(html)

        # Walk every <ul>/<li> entry on the academy page
        for ul_li in etr.xpath("//ul/li"):
            subjects = ul_li.xpath(".//a/text()")
            hrefs = ul_li.xpath(".//a/@href")
            if not subjects or not hrefs:
                # A list item without a usable link would previously raise
                # IndexError and abort the whole crawl; skip it instead.
                continue
            subject = subjects[0]
            yx_url = hrefs[0]

            # URL of the staff list page
            list_url = "http://tel.net.nthu.edu.tw/nthusearch/" + yx_url
            print(list_url, subject)
            text = r.get(list_url, timeout=REQUEST_TIMEOUT).text

            # Rows of the second table on the page, minus its first row
            etr_html = etree.HTML(text)
            tr_list = etr_html.xpath("//table[2]/tr")[1:]

            for tr in tr_list:
                name, professional, phone, email = _parse_person_row(tr)

                print(school_url, school, academy, academy_url, name, list_url, subject, professional, phone, email)
                # Kept in its own name so the academy list `li` being iterated
                # by the outer loop is not rebound.
                row = [school_url, school, academy, academy_url, name, list_url, subject, professional, phone, email]
                con.insert_TB("talent_info", row,
                              "school_url", "school", "academy", "academy_url", "name",
                              "list_url", "subject", "professional", "phone", "email")
